STEP 3 - Review results from DTM_inferLDATopics_LabelCorpus

This notebook: 1. Reads in labeled outputs from 06_pq_model.Rmd (“06_pq_labels.csv”) 2. Joins “06_pq_labels.csv” with “01_pq_metaclean.csv” 3. Subsets dataset based on labels and percent probabilities 4. Writes subsets to CSVs

Notes

  1. “SUBSETTING” keyword for searching 06_pq_label_review.Rmd
#load data
# 01_pq_metaclean.csv
pq_metaclean <- data.table::fread('Data/02_Working/01_pq_metaclean.csv')
# read in columns as characters so that doc id does not read in as numeric
# 04_pq_labels.csv
pq_labels <- data.table::fread(paste0('Data/02_Working/',rFileModelNum,'_pq_labels.csv'), colClasses = 'character')

# Head displays the first 6 rows of the data.table
#head(pq_metaclean)
#head(pq_labels)
# displays column names
print("pq_metaclean columns:")
[1] "pq_metaclean columns:"
names(pq_metaclean)
 [1] "Title"                   "Publication title"       "Publication year"        "Document URL"            "Full text"              
 [6] "Links"                   "Section"                 "Publication subject"     "ISSN"                    "Copyright"              
[11] "Abstract"                "Publication info"        "Last updated"            "Place of publication"    "Location"               
[16] "Author"                  "Publisher"               "Identifier / keyword"    "Source type"             "ProQuest document ID"   
[21] "Country of publication"  "Language of publication" "Publication date"        "Subject"                 "Database"               
[26] "Document type"          
print("")
[1] ""
print("pq_labels columns:")
[1] "pq_labels columns:"
names(pq_labels)
[1] "document" "topic"    "val"     
nrow(pq_metaclean);nrow(pq_labels)
[1] 6239
[1] 4031

join cleaned dataset with labels

# join tables
# inner_join because pq_metaclean was subset based on topic 2 when the model was re-ran in "04_pq_model.Rmd"
# so we only want where the Proquest ID exists in both the original dataset and the labels.
pq_metajoin <- pq_labels %>% 
  inner_join(pq_metaclean, by = c("document" = "ProQuest document ID"))

pq_metajoin <- pq_metajoin %>%
  rename(`ProQuest document ID` = document)

nrow(pq_metajoin);head(pq_metajoin)
[1] 4030
pq_empty<-pq_metajoin[pq_metajoin$topic == "" | is.na(pq_metajoin$topic),]
nrow(pq_empty)
[1] 0
# write labeled corpus to CSV
outputFileName = "pq_topics"
outputFile = paste(outputFolder,rFileNum,"_",outputFileName,".xlsx",sep="")
if (!file.exists(outputFile) | overwrite) {
  write.xlsx(pq_metajoin, outputFile, row.names=FALSE)
  #write.htmltable(pq_metajoin,title=outputFile, outputFile, sortby="topic","val")
}

Code now ready for SUBSETTING if you want to skip the following sections –

exploratory data analysis of topics generated

Do we want to keep both topics, or drop one of the topics and dig deeper into the other topic?

number of topics by publication title

plotTitle = "Count of Articles by Topic and Publication Title"

# count by pub title and topic
count_topic_pubtitle <-pq_metajoin %>%
  group_by(`Publication title`,topic) %>% 
  summarise(n= n()) %>%
  arrange(as.numeric(topic),as.numeric(n))
`summarise()` has grouped output by 'Publication title'. You can override using the `.groups` argument.
count_topic_pubtitle

# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("count_topic_pubtitle.png"))
png(outputPNGFileName,height=6,width=12, units='in', res=300)
ggplot(count_topic_pubtitle, aes(x = as.factor(as.numeric(topic)), y = n, fill = `Publication title`, label = n )) +
  geom_bar(stat = "identity") +
  geom_text(size = 3, position = position_stack(vjust = 0.5)) +
  ggtitle(plotTitle) +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("Topic") +
  ylab("Article Count") +
  labs(fill = "Topic")
print(paste("Plot saved as:",outputPNGFileName))
[1] "Plot saved as: Images//07_pq_review/count_topic_pubtitle.png"
dev.off()
null device 
          1 
knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/count_topic_pubtitle.png"))

number of topics by location

Count articles by topic and year

Important to consider the total articles/year in addition to the raw count of articles per topic. Peaks in articles may simply be due to an overall increase of articles for a particular year.

# article count per year and topic
count_topic_year <-pq_metajoin %>%
  group_by(`Publication year`,topic) %>% 
  summarise(n= n()) %>%
  arrange(as.numeric(topic),as.numeric(`Publication year`))
`summarise()` has grouped output by 'Publication year'. You can override using the `.groups` argument.
count_topic_year

# article count per year
count_year <-pq_metajoin %>%
  group_by(`Publication year`) %>% 
  summarise(perYear= n()) %>%
  arrange(as.numeric(`Publication year`))

count_topic_year<-count_topic_year %>%
  left_join(count_year, by = "Publication year")

count_topic_year<-count_topic_year %>%
  mutate(ratio = round(n/perYear,2))

plotTitle = "Count of Articles by Topic and Year"
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("count_topic_year.png"))
png(outputPNGFileName,height=5,width=15, units='in', res=300)
ggplot(count_topic_year, aes(x = `Publication year`, colour = as.factor(as.numeric(topic)))) +
  geom_line(aes(y = n)) +
  scale_x_continuous(breaks=seq(min(na.omit(count_topic_year$`Publication year`)), max(na.omit(count_topic_year$`Publication year`)),1)) +
  theme(axis.text.x = element_text(angle = 90),
        plot.title = element_text(hjust = 0.5)) +
  facet_wrap(vars(as.numeric(topic))) +
  ggtitle(plotTitle) +
  xlab("Publication Year") +
  ylab("Article Count") +
  guides(colour=FALSE) 
print(paste("Plot saved as:",outputPNGFileName))
[1] "Plot saved as: Images//07_pq_review/count_topic_year.png"
dev.off()
null device 
          1 
plotTitle = "Count of Articles by Topic and Year with Year Total"
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("count_topic_year_tot.png"))
png(outputPNGFileName,height=5,width=15, units='in', res=300)
ggplot(count_topic_year, aes(x = `Publication year`, colour = as.factor(as.numeric(topic)))) +
  geom_line(aes(y = n)) +
  geom_line(linetype = "dashed", color="black", aes(y = perYear)) +
  scale_x_continuous(breaks=seq(min(na.omit(count_topic_year$`Publication year`)), max(na.omit(count_topic_year$`Publication year`)),1)) +
  theme(axis.text.x = element_text(angle = 90),
        plot.title = element_text(hjust = 0.5)) +
  facet_wrap(vars(as.numeric(topic))) +
  ggtitle(plotTitle) +
  xlab("Publication Year") +
  ylab("Article Count") +
  guides(colour=FALSE) 
print(paste("Plot saved as:",outputPNGFileName))
[1] "Plot saved as: Images//07_pq_review/count_topic_year_tot.png"
dev.off()
null device 
          1 
plotTitle = "Ratio of Articles by Topic and Year"
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("ratio_topic_year.png"))
png(outputPNGFileName,height=5,width=15, units='in', res=300)
ggplot(count_topic_year, aes(x = `Publication year`, colour = as.factor(as.numeric(topic)))) +
  geom_line(aes(y = ratio)) +
  scale_x_continuous(breaks=seq(min(na.omit(count_topic_year$`Publication year`)), max(na.omit(count_topic_year$`Publication year`)),1)) +
  theme(axis.text.x = element_text(angle = 90),
        plot.title = element_text(hjust = 0.5)) +
  facet_wrap(vars(as.numeric(topic))) +
  ggtitle(plotTitle) +
  xlab("Publication Year") +
  ylab("(Article Count)/(Article Total)") +
  guides(colour=FALSE) 
print(paste("Plot saved as:",outputPNGFileName))
[1] "Plot saved as: Images//07_pq_review/ratio_topic_year.png"
dev.off()
null device 
          1 
knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/count_topic_year.png"))

knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/count_topic_year_tot.png"))

knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/ratio_topic_year.png"))

Coherence score by topic

knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/coherence_score_topic.png"))

Wordclouds

knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/1_lda_topic_wc.png"))

knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/2_lda_topic_wc.png"))

Count of Article by Topic

# bar chart
knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/topic_count_bar.png"))


# pie chart
knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/topic_count_pie.png"))

Count of Probabilities by Percentage

knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/prob_count.png"))

Count of Topic Probabilities by Percentage

knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/prob_topic_count.png"))

#5. Visualising of topics in a dendrogram - not enough topics (<2)

# knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/hclust_dendrogram.png"))

START SUBSETTING - Subset Function

topic_subset_csv <- function(outputFolderName, rFileNum, outputFileName, inputCorpus, minPerc, maxPerc, theTopic, overwrite)

  • outputFolderName = folder to save to – i.e., “Data/02_Working/” (this is set at the top of the code where the libraries are loaded/installed)
  • rFileNum = The number in the r file – i.e., “05” (this is set at the top of the code where the libraries are loaded/installed)
  • outputFileName = the file name to save as – i.e., “pq_topic5_90perc” (this needs to be set at least 1 line above where the function is ran)
  • inputCorpus = the labeled corpus – i.e., pq_metajoin (this is created shortly after where libraries are loaded/installed in review.Rmd by joining pq_metaclean with pq_labels)
  • minPerc = minimum percentage – i.e., 0.9 (this needs to be set at least 1 line above where the function is ran)
  • maxPerc = maximum percentage – i.e., 1 (this needs to be set at least 1 line above where the function is ran)
  • theTopic = the topic category – i.e., “1” (this needs to be set at least 1 line above where the function is ran)
  • overwrite = whether to overwrite the file if it already exists – i.e., FALSE (this needs to be set at least 1 line above where the function is ran)

Subset topic 1

# subset corpus to unique identifier & full text of article
# investigate topic 1

outputFileName_1t <- "pq_topic1"
minPerc <- 0
maxPerc <- 1
theTopic <- "1"

## e.g, minPerc = 0 & maxPerc = 0.4 is zero to 0.3999999999 ...
pq_topic1_subset<-topic_subset_csv(outputFolder, rFileNum, outputFileName_1t, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

Already exists, loading: pq_topic1 
head(pq_topic1_subset);nrow(pq_topic1_subset)
[1] 1025

Subset topic 2

# subset corpus to unique identifier & full text of article
# investigate topic 2

outputFileName_1t <- "pq_topic2"
minPerc <- 0
maxPerc <- 1
theTopic <- "2"

## e.g, minPerc = 0 & maxPerc = 0.4 is zero to 0.3999999999 ...
pq_topic1_subset<-topic_subset_csv(outputFolder, rFileNum, outputFileName_1t, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

Already exists, loading: pq_topic2 
head(pq_topic1_subset);nrow(pq_topic1_subset)
[1] 3005

Subset topic 1, 90%

# subset corpus to unique identifier & full text of article
# investigate topic 1, 90%
outputFile_1t90perc <- "pq_topic1_90perc"
minPerc <- 0.9
maxPerc <- 1
theTopic <- "1"

## e.g, minPerc = 0 & maxPerc = 0.4 is zero to 0.3999999999 ...
pq_topic1_90perc<-topic_subset_csv(outputFolder, rFileNum, outputFile_1t90perc, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

Already exists, loading: pq_topic1_90perc 
head(pq_topic1_90perc);nrow(pq_topic1_90perc)
[1] 733

Subset topic 2, 90%

# subset corpus to unique identifier & full text of article
# investigate topic 2, 90%
outputFile_5t90perc <- "pq_topic2_90perc"
minPerc <- 0.9
maxPerc <- 1
theTopic <- "2"

## e.g, minPerc = 0 & maxPerc = 0.4 is zero to 0.3999999999 ...
pq_topic5_90perc<-topic_subset_csv(outputFolder, rFileNum, outputFile_5t90perc, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

Already exists, loading: pq_topic2_90perc 
head(pq_topic5_90perc);nrow(pq_topic5_90perc)
[1] 1994

Subset 50-60%-ers

# subset corpus to unique identifier & full text of article
# investigate all topics, 50-60%
outputFile_56perc <- "pq_56perc"
minPerc <- 0.5
maxPerc <- 0.6
theTopic <- "all"

## e.g, minPerc = 0 & maxPerc = 0.4 is zero to 0.3999999999 ...
pq_56perc<-topic_subset_csv(outputFolder, rFileNum, outputFile_56perc, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

Already exists, loading: pq_56perc 
head(pq_56perc);nrow(pq_56perc)
[1] 224

Subset 0-60%-ers

# subset corpus to unique identifier & full text of article
# investigate all topics, 0-60%
# if within the function you write overwrite=TRUE, then the output CSV file will be re-written. 
# overwrite, outputFolder, rFileName are set in the first chunk of code


outputFileName_06perc <- "pq_06perc"
minPerc <- 0
maxPerc <- 0.6
theTopic <- "all"

## e.g, minPerc = 0 & maxPerc = 0.6 is zero to 0.59999999 ...
pq_perc06_subset<-topic_subset_csv(outputFolder, rFileNum, outputFileName_06perc, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

Already exists, loading: pq_06perc 
head(pq_perc06_subset);nrow(pq_topic1_subset)
[1] 3005
unique(pq_perc06_subset$topic)
[1] "2" "1"
min(pq_perc06_subset$val)
[1] "0.50066"
max(pq_perc06_subset$val)
[1] "0.59996"

END SUBSETTING

explore term frequencies by year - how do words in our corpus change over time?

Referenced walk-through from: https://cran.r-project.org/web/packages/tidytext/vignettes/tidying_casting.html Are articles normalized based on the number of articles per year

# subset corpus to unique identifier & year
pq_time <- pq_metaclean %>% 
  select(`ProQuest document ID`, `Publication year`)

# tokens generated from 06_pq_model.Rmd
outputTokenFile = paste0(rFileModelNum,"_tokens.RData")
tokensFileName = file.path(outputFolder,outputTokenFile)

load(file=tokensFileName)

tokens <- tokens %>% 
  full_join(pq_time, by = c("ProQuest document ID" = "ProQuest document ID")) %>%
  rename(Year = `Publication year`) %>%
  rename(pq_id = `ProQuest document ID`) %>%
  mutate_at(vars(Year), funs(as.integer))
`funs()` was deprecated in dplyr 0.8.0.
Please use a list of either functions or lambdas: 

  # Simple named list: 
  list(mean = mean, median = median)

  # Auto named with `tibble::lst()`: 
  tibble::lst(mean, median)

  # Using lambdas
  list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
rm(pq_time)

outputFreqsFile = paste0(rFileNum,"_tokens_freq")
tokens_freq<-create_ifnot_tokens_freq(outputFolder, outputFreqsFile, tokens, overwrite)

Already exists, loading: 07_tokens_freq 
outputFreqModelFile = paste0(rFileNum,"_freq_models")
freq_models <- create_ifnot_freqmodels(outputFolder, outputFreqModelFile, tokens_freq, overwrite)

Already exists, loading: 07_freq_models 

model results

freq_models %>%
  filter(term == "Year") %>%
  arrange(desc(abs(estimate)))

Models displayed as a volcano plot, which compares the effect size with the significance

# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("word_change_over_time.png"))
png(outputPNGFileName,height=5,width=15, units='in', res=300)

freq_models %>%
  mutate(adjusted.p.value = p.adjust(p.value)) %>%
  ggplot(aes(estimate, adjusted.p.value)) +
  geom_point() +
  scale_y_log10() +
  geom_text(aes(label = word), vjust = 1, hjust = 1,
            check_overlap = TRUE) +
  xlab("Estimated change over time") +
  ylab("Adjusted p-value")


print(paste("Plot saved as:",outputPNGFileName))
[1] "Plot saved as: Images//07_pq_review/word_change_over_time.png"
dev.off()
null device 
          1 
knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/word_change_over_time.png"))

Top 6 terms that have changed in frequency over time

# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("top_word_change_over_time.png"))
png(outputPNGFileName,height=5,width=15, units='in', res=300)

freq_models %>%
  top_n(6, abs(estimate)) %>%
  inner_join(tokens_freq) %>%
  ggplot(aes(Year, percent)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~ word) +
  scale_y_continuous(labels = percent_format()) +
  ylab("Frequency of word in speech")
Joining, by = "word"
print(paste("Plot saved as:",outputPNGFileName))
[1] "Plot saved as: Images//07_pq_review/top_word_change_over_time.png"
dev.off()
null device 
          1 
knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/top_word_change_over_time.png"))

---
title: "pq_label_review"
output:
  pdf_document: default
  html_notebook: default
---

# STEP 3 - Review results from DTM_inferLDATopics_LabelCorpus

This notebook:
1. Reads in labeled outputs from 06_pq_model.Rmd ("06_pq_labels.csv")
2. Joins "06_pq_labels.csv" with "01_pq_metaclean.csv"
3. Subsets dataset based on labels and percent probabilities
4. Writes subsets to CSVs

# Notes
1. "SUBSETTING" keyword for searching 06_pq_label_review.Rmd

```{r, echo=FALSE, results="hide", messages=FALSE}
# Load and Install Libraries
source("SEN_functions.R")

## Check libraries & install
LibraryList<-c("stringr","data.table","dplyr","tidyr","magrittr","NLP","tidytext","tm","ggplot2",
               "scales", "ggwordcloud","textmineR","digest", "broom", "stringi", "xlsx")
install_or_load_pack(LibraryList)


outputFolder = "Data/02_Working/"
outputImgFolder = "Images/"
rFileNum = "07"
rFileModelNum = "06"

outputPngFolder<-file.path(outputImgFolder, paste0(rFileNum,"_pq_review"))
if (!dir.exists(outputPngFolder)) dir.create(outputPngFolder)

overwrite = FALSE
```


```{r}
#load data
# 01_pq_metaclean.csv
pq_metaclean <- data.table::fread('Data/02_Working/01_pq_metaclean.csv')
# read in columns as characters so that doc id does not read in as numeric
# 04_pq_labels.csv
pq_labels <- data.table::fread(paste0('Data/02_Working/',rFileModelNum,'_pq_labels.csv'), colClasses = 'character')

# Head displays the first 6 rows of the data.table
#head(pq_metaclean)
#head(pq_labels)
```


```{r}
# displays column names
print("pq_metaclean columns:")
names(pq_metaclean)
print("")
print("pq_labels columns:")
names(pq_labels)
nrow(pq_metaclean);nrow(pq_labels)
```

# join cleaned dataset with labels
```{r}
# join tables
# inner_join because pq_metaclean was subset based on topic 2 when the model was re-ran in "04_pq_model.Rmd"
# so we only want where the Proquest ID exists in both the original dataset and the labels.
pq_metajoin <- pq_labels %>% 
  inner_join(pq_metaclean, by = c("document" = "ProQuest document ID"))

pq_metajoin <- pq_metajoin %>%
  rename(`ProQuest document ID` = document)

nrow(pq_metajoin);head(pq_metajoin)

pq_empty<-pq_metajoin[pq_metajoin$topic == "" | is.na(pq_metajoin$topic),]
nrow(pq_empty)

# write labeled corpus to CSV
outputFileName = "pq_topics"
outputFile = paste(outputFolder,rFileNum,"_",outputFileName,".xlsx",sep="")
if (!file.exists(outputFile) | overwrite) {
  write.xlsx(pq_metajoin, outputFile, row.names=FALSE)
  #write.htmltable(pq_metajoin,title=outputFile, outputFile, sortby="topic","val")
}
```
# Code now ready for SUBSETTING if you want to skip the following sections --


# exploratory data analysis of topics generated
Do we want to keep both topics, or drop one of the topics and dig deeper into the other topic?

# number of topics by publication title
```{r, messages=FALSE}
plotTitle = "Count of Articles by Topic and Publication Title"

# count by pub title and topic
count_topic_pubtitle <-pq_metajoin %>%
  group_by(`Publication title`,topic) %>% 
  summarise(n= n()) %>%
  arrange(as.numeric(topic),as.numeric(n))

count_topic_pubtitle

# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("count_topic_pubtitle.png"))
png(outputPNGFileName,height=6,width=12, units='in', res=300)
ggplot(count_topic_pubtitle, aes(x = as.factor(as.numeric(topic)), y = n, fill = `Publication title`, label = n )) +
  geom_bar(stat = "identity") +
  geom_text(size = 3, position = position_stack(vjust = 0.5)) +
  ggtitle(plotTitle) +
  theme(plot.title = element_text(hjust = 0.5)) +
  xlab("Topic") +
  ylab("Article Count") +
  labs(fill = "Topic")
print(paste("Plot saved as:",outputPNGFileName))
dev.off()

knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/count_topic_pubtitle.png"))
```

# number of topics by location

# Count articles by topic and year
Important to consider the total articles/year in addition to the raw count of articles per topic.
Peaks in articles may simply be due to an overall increase of articles for a particular year.
```{r, messages=FALSE}
# article count per year and topic
count_topic_year <-pq_metajoin %>%
  group_by(`Publication year`,topic) %>% 
  summarise(n= n()) %>%
  arrange(as.numeric(topic),as.numeric(`Publication year`))

count_topic_year

# article count per year
count_year <-pq_metajoin %>%
  group_by(`Publication year`) %>% 
  summarise(perYear= n()) %>%
  arrange(as.numeric(`Publication year`))

count_topic_year<-count_topic_year %>%
  left_join(count_year, by = "Publication year")

count_topic_year<-count_topic_year %>%
  mutate(ratio = round(n/perYear,2))

plotTitle = "Count of Articles by Topic and Year"
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("count_topic_year.png"))
png(outputPNGFileName,height=5,width=15, units='in', res=300)
ggplot(count_topic_year, aes(x = `Publication year`, colour = as.factor(as.numeric(topic)))) +
  geom_line(aes(y = n)) +
  scale_x_continuous(breaks=seq(min(na.omit(count_topic_year$`Publication year`)), max(na.omit(count_topic_year$`Publication year`)),1)) +
  theme(axis.text.x = element_text(angle = 90),
        plot.title = element_text(hjust = 0.5)) +
  facet_wrap(vars(as.numeric(topic))) +
  ggtitle(plotTitle) +
  xlab("Publication Year") +
  ylab("Article Count") +
  guides(colour=FALSE) 
print(paste("Plot saved as:",outputPNGFileName))
dev.off()

plotTitle = "Count of Articles by Topic and Year with Year Total"
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("count_topic_year_tot.png"))
png(outputPNGFileName,height=5,width=15, units='in', res=300)
ggplot(count_topic_year, aes(x = `Publication year`, colour = as.factor(as.numeric(topic)))) +
  geom_line(aes(y = n)) +
  geom_line(linetype = "dashed", color="black", aes(y = perYear)) +
  scale_x_continuous(breaks=seq(min(na.omit(count_topic_year$`Publication year`)), max(na.omit(count_topic_year$`Publication year`)),1)) +
  theme(axis.text.x = element_text(angle = 90),
        plot.title = element_text(hjust = 0.5)) +
  facet_wrap(vars(as.numeric(topic))) +
  ggtitle(plotTitle) +
  xlab("Publication Year") +
  ylab("Article Count") +
  guides(colour=FALSE) 
print(paste("Plot saved as:",outputPNGFileName))
dev.off()

plotTitle = "Ratio of Articles by Topic and Year"
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("ratio_topic_year.png"))
png(outputPNGFileName,height=5,width=15, units='in', res=300)
ggplot(count_topic_year, aes(x = `Publication year`, colour = as.factor(as.numeric(topic)))) +
  geom_line(aes(y = ratio)) +
  scale_x_continuous(breaks=seq(min(na.omit(count_topic_year$`Publication year`)), max(na.omit(count_topic_year$`Publication year`)),1)) +
  theme(axis.text.x = element_text(angle = 90),
        plot.title = element_text(hjust = 0.5)) +
  facet_wrap(vars(as.numeric(topic))) +
  ggtitle(plotTitle) +
  xlab("Publication Year") +
  ylab("(Article Count)/(Article Total)") +
  guides(colour=FALSE) 
print(paste("Plot saved as:",outputPNGFileName))
dev.off()

knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/count_topic_year.png"))
knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/count_topic_year_tot.png"))
knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/ratio_topic_year.png"))
```

# Coherence score by topic
```{r, out.width="50%", fig.pos="h"}
knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/coherence_score_topic.png"))
```

# Wordclouds
```{r, out.width="50%", fig.pos="h"}
knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/1_lda_topic_wc.png"))
knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/2_lda_topic_wc.png"))
```
# Count of Article by Topic
```{r, out.width="50%", fig.pos="h"}
# bar chart
knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/topic_count_bar.png"))

# pie chart
knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/topic_count_pie.png"))
```

# Count of Probabilities by Percentage
```{r, out.width="50%", fig.pos="h"}
knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/prob_count.png"))
```
# Count of Topic Probabilities by Percentage
```{r, out.width="50%", fig.pos="h"}
knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/prob_topic_count.png"))
```
#5. Visualising of topics in a dendrogram - not enough topics (<2)
```{r, out.width="50%", fig.pos="h"}
# knitr::include_graphics(paste0("Images/",rFileModelNum,"_pq_model/hclust_dendrogram.png"))

```

# START SUBSETTING - Subset Function
#### topic_subset_csv <- function(outputFolderName, rFileNum, outputFileName, inputCorpus, minPerc, maxPerc, theTopic, overwrite)

* outputFolderName = folder to save to -- i.e., "Data/02_Working/" (this is set at the top of the code where the libraries are loaded/installed)
* rFileNum = The number in the r file -- i.e., "05" (this is set at the top of the code where the libraries are loaded/installed)
* outputFileName = the file name to save as -- i.e., "pq_topic5_90perc" (this needs to be set at least 1 line above where the function is ran)
* inputCorpus = the labeled corpus -- i.e., pq_metajoin (this is created shortly after where libraries are loaded/installed in review.Rmd by joining pq_metaclean with pq_labels)
* minPerc = minimum percentage -- i.e., 0.9 (this needs to be set at least 1 line above where the function is ran)
* maxPerc = maximum percentage -- i.e., 1 (this needs to be set at least 1 line above where the function is ran)
* theTopic = the topic category -- i.e., "1" (this needs to be set at least 1 line above where the function is ran)
* overwrite = whether to overwrite the file if it already exists -- i.e., FALSE (this needs to be set at least 1 line above where the function is ran)

# Subset topic 1
```{r}
# subset corpus to unique identifier & full text of article
# investigate topic 1

outputFileName_1t <- "pq_topic1"
minPerc <- 0
maxPerc <- 1
theTopic <- "1"

## e.g, minPerc = 0 & maxPerc = 0.4 is zero to 0.3999999999 ...
pq_topic1_subset<-topic_subset_csv(outputFolder, rFileNum, outputFileName_1t, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

head(pq_topic1_subset);nrow(pq_topic1_subset)
```
# Subset topic 2
```{r}
# subset corpus to unique identifier & full text of article
# investigate topic 2

outputFileName_1t <- "pq_topic2"
minPerc <- 0
maxPerc <- 1
theTopic <- "2"

## e.g, minPerc = 0 & maxPerc = 0.4 is zero to 0.3999999999 ...
pq_topic1_subset<-topic_subset_csv(outputFolder, rFileNum, outputFileName_1t, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

head(pq_topic1_subset);nrow(pq_topic1_subset)
```

# Subset topic 1, 90%
```{r}
# subset corpus to unique identifier & full text of article
# investigate topic 1, 90%
outputFile_1t90perc <- "pq_topic1_90perc"
minPerc <- 0.9
maxPerc <- 1
theTopic <- "1"

## e.g, minPerc = 0 & maxPerc = 0.4 is zero to 0.3999999999 ...
pq_topic1_90perc<-topic_subset_csv(outputFolder, rFileNum, outputFile_1t90perc, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

head(pq_topic1_90perc);nrow(pq_topic1_90perc)
```


# Subset topic 2, 90%
```{r}
# subset corpus to unique identifier & full text of article
# investigate topic 2, 90%
outputFile_5t90perc <- "pq_topic2_90perc"
minPerc <- 0.9
maxPerc <- 1
theTopic <- "2"

## e.g, minPerc = 0 & maxPerc = 0.4 is zero to 0.3999999999 ...
pq_topic5_90perc<-topic_subset_csv(outputFolder, rFileNum, outputFile_5t90perc, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

head(pq_topic5_90perc);nrow(pq_topic5_90perc)
```


# Subset 50-60%-ers
```{r}
# subset corpus to unique identifier & full text of article
# investigate all topics, 50-60%
outputFile_56perc <- "pq_56perc"
minPerc <- 0.5
maxPerc <- 0.6
theTopic <- "all"

## e.g, minPerc = 0 & maxPerc = 0.4 is zero to 0.3999999999 ...
pq_56perc<-topic_subset_csv(outputFolder, rFileNum, outputFile_56perc, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

head(pq_56perc);nrow(pq_56perc)
```

# Subset 0-60%-ers
```{r}
# subset corpus to unique identifier & full text of article
# investigate all topics, 0-60%
# if within the function you write overwrite=TRUE, then the output CSV file will be re-written. 
# overwrite, outputFolder, rFileName are set in the first chunk of code


outputFileName_06perc <- "pq_06perc"
minPerc <- 0
maxPerc <- 0.6
theTopic <- "all"

## e.g, minPerc = 0 & maxPerc = 0.6 is zero to 0.59999999 ...
pq_perc06_subset<-topic_subset_csv(outputFolder, rFileNum, outputFileName_06perc, pq_metajoin , minPerc, maxPerc, theTopic, overwrite)

head(pq_perc06_subset);nrow(pq_topic1_subset)
unique(pq_perc06_subset$topic)
min(pq_perc06_subset$val)
max(pq_perc06_subset$val)
```

# END SUBSETTING

# explore term frequencies by year - how do words in our corpus change over time?
Referenced walk-through from: https://cran.r-project.org/web/packages/tidytext/vignettes/tidying_casting.html
Are articles normalized based on the number of articles per year
```{r}
# subset corpus to unique identifier & year
pq_time <- pq_metaclean %>% 
  select(`ProQuest document ID`, `Publication year`)

# tokens generated from 06_pq_model.Rmd
outputTokenFile = paste0(rFileModelNum,"_tokens.RData")
tokensFileName = file.path(outputFolder,outputTokenFile)

load(file=tokensFileName)

tokens <- tokens %>% 
  full_join(pq_time, by = c("ProQuest document ID" = "ProQuest document ID")) %>%
  rename(Year = `Publication year`) %>%
  rename(pq_id = `ProQuest document ID`) %>%
  mutate_at(vars(Year), funs(as.integer))
rm(pq_time)

outputFreqsFile = paste0(rFileNum,"_tokens_freq")
tokens_freq<-create_ifnot_tokens_freq(outputFolder, outputFreqsFile, tokens, overwrite)

outputFreqModelFile = paste0(rFileNum,"_freq_models")
freq_models <- create_ifnot_freqmodels(outputFolder, outputFreqModelFile, tokens_freq, overwrite)
```

# model results
```{r}
freq_models %>%
  filter(term == "Year") %>%
  arrange(desc(abs(estimate)))
```

# Models displayed as a volcano plot, which compares the effect size with the significance
```{r}
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("word_change_over_time.png"))
png(outputPNGFileName,height=5,width=15, units='in', res=300)

freq_models %>%
  mutate(adjusted.p.value = p.adjust(p.value)) %>%
  ggplot(aes(estimate, adjusted.p.value)) +
  geom_point() +
  scale_y_log10() +
  geom_text(aes(label = word), vjust = 1, hjust = 1,
            check_overlap = TRUE) +
  xlab("Estimated change over time") +
  ylab("Adjusted p-value")


print(paste("Plot saved as:",outputPNGFileName))
dev.off()

knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/word_change_over_time.png"))
```

# Top 6 terms that have changed in frequency over time
```{r}
# save plot in png format
outputPNGFileName <- file.path(outputPngFolder,paste0("top_word_change_over_time.png"))
png(outputPNGFileName,height=5,width=15, units='in', res=300)

freq_models %>%
  top_n(6, abs(estimate)) %>%
  inner_join(tokens_freq) %>%
  ggplot(aes(Year, percent)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(~ word) +
  scale_y_continuous(labels = percent_format()) +
  ylab("Frequency of word in speech")

print(paste("Plot saved as:",outputPNGFileName))
dev.off()

knitr::include_graphics(paste0("Images/",rFileNum,"_pq_review/top_word_change_over_time.png"))
```
